library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.3 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(stringr)
library(ggrepel)
humandf <- read_tsv("human_biomart.txt")
## Rows: 67128 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Gene stable ID, Chromosome/scaffold name
## dbl (2): Gene start (bp), Gene end (bp)
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
zfishdf <- read_tsv("zfish_biomart.txt")
## Rows: 37241 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Gene stable ID, Chromosome/scaffold name
## dbl (2): Gene start (bp), Gene end (bp)
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
rbhdf <- read_tsv("Human_Zebrafish_RBH.tsv")
## Rows: 5476 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): Human Gene ID, Human Protein ID, Human Gene Name, Zebrafish Gene ID...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(humandf)[1] <- 'Human Gene ID'
names(humandf)[2] <- 'Human Chrom'
names(humandf)[3] <- 'Human Gene Start'
names(humandf)[4] <- 'Human Gene End'
names(zfishdf)[1] <- 'Zebrafish Gene ID'
names(zfishdf)[2] <- 'Zebrafish Chrom'
names(zfishdf)[3] <- 'Zebrafish Gene Start'
names(zfishdf)[4] <- 'Zebrafish Gene End'
merged <- left_join(left_join(rbhdf, zfishdf), humandf)
## Joining, by = "Zebrafish Gene ID"
## Joining, by = "Human Gene ID"
a <- table(merged$`Zebrafish Chrom`)
b <- table(merged$`Human Chrom`)
zfish_chrom_count <- a[order(-as.numeric(names(a)), decreasing = TRUE)]
## Warning in order(-as.numeric(names(a)), decreasing = TRUE): NAs introduced by
## coercion
human_chrom_count <- b[order(-as.numeric(names(b)), decreasing = TRUE)]
## Warning in order(-as.numeric(names(b)), decreasing = TRUE): NAs introduced by
## coercion
zfish_chrom_count <- as.data.frame(zfish_chrom_count[1:25])
human_chrom_count <- as.data.frame(human_chrom_count[1:22])
names(zfish_chrom_count)[1] <- 'Zebrafish Chromosome Number'
names(human_chrom_count)[1] <- 'Human Chromosome Number'
print(zfish_chrom_count)
## Zebrafish Chromosome Number Freq
## 1 1 253
## 2 2 264
## 3 3 236
## 4 4 195
## 5 5 326
## 6 6 251
## 7 7 272
## 8 8 223
## 9 9 224
## 10 10 186
## 11 11 187
## 12 12 198
## 13 13 242
## 14 14 172
## 15 15 163
## 16 16 207
## 17 17 225
## 18 18 197
## 19 19 175
## 20 20 265
## 21 21 216
## 22 22 136
## 23 23 180
## 24 24 163
## 25 25 194
print(human_chrom_count)
## Human Chromosome Number Freq
## 1 1 550
## 2 2 433
## 3 3 354
## 4 4 236
## 5 5 279
## 6 6 276
## 7 7 273
## 8 8 175
## 9 9 220
## 10 10 255
## 11 11 296
## 12 12 305
## 13 13 83
## 14 14 177
## 15 15 192
## 16 16 245
## 17 17 313
## 18 18 84
## 19 19 228
## 20 20 150
## 21 21 48
## 22 22 130
n_tbl <- tibble(zfish_start = NA,
human_chrom = NA,
zfish_chrom = NA)
for (i in seq(1:25)){
index <- which(merged$`Zebrafish Chrom` == i)
for (x in index) {
n_tbl%>%
add_row(zfish_start = merged[x, ]$`Zebrafish Gene Start`,
human_chrom = merged[x, ]$`Human Chrom`,
zfish_chrom = merged[x, ]$`Zebrafish Chrom`)->n_tbl
}
}
grouped_n_tbl <- n_tbl %>%
group_by(zfish_chrom)
ordered_n_tbl <- grouped_n_tbl[order(as.numeric(grouped_n_tbl$"zfish_chrom")), ]
d_list <- group_split(ordered_n_tbl)
for (i in 1:25) {
a <- tibble(zfish_start <- d_list[[i]][,1],
human_chrom <- d_list[[i]][,2])
plot_name <- as.character(d_list[[i]][,3][1,])
a <- a[!str_detect(a$human_chrom, '([A-Za-z])'), ]
a <- a[!str_detect(a$human_chrom, '\\.$'), ]
a$human_chrom <- unlist(as.numeric(a$human_chrom))
a$zfish_start <- unlist(as.numeric(a$zfish_start))
a <- a[order(a$human_chrom),]
title1 <- paste("\nPutative Orthologs Between Zebrafish Chromosome",i)
title2 <- paste(title1, "and All Human Chromosomes\n")
print( ggplot(a, aes(x = factor(zfish_start), y = factor(human_chrom), fill=factor(human_chrom))) +
geom_dotplot(binaxis = "y",
stackdir = "centerwhole",
binwidth = 0.38, color=NA) +
scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
theme(axis.text.x = element_text(size=7, angle = 90, hjust = 1),
legend.position = "none",
plot.title = element_text(size = 10, face = "bold")) +
xlab("\nGene Start Positions As Basepair #") +
ylab("Human Chromosome #") +
ggtitle(title2))
}
























